/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.parse;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.plugin.Extension;
import org.apache.nutch.plugin.ExtensionPoint;
import org.apache.nutch.plugin.PluginRepository;
import org.apache.nutch.plugin.PluginRuntimeException;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.ObjectCache;
import org.w3c.dom.DocumentFragment;

/** Creates and caches {@link ParseFilter} implementing plugins. */
public class ParseFilters {

	private ParseFilter[] parseFilters;

	public static final String HTMLPARSEFILTER_ORDER = "htmlparsefilter.order";

	public ParseFilters(Configuration conf) {
		String order = conf.get(HTMLPARSEFILTER_ORDER);
		ObjectCache objectCache = ObjectCache.get(conf);
		this.parseFilters = (ParseFilter[]) objectCache
				.getObject(ParseFilter.class.getName());
		if (parseFilters == null) {
			/*
			 * If ordered filters are required, prepare array of filters based
			 * on property
			 */
			String[] orderedFilters = null;
			if (order != null && !order.trim().equals("")) {
				orderedFilters = order.split("\\s+");
			}
			HashMap<String, ParseFilter> filterMap = new HashMap<String, ParseFilter>();
			try {
				ExtensionPoint point = PluginRepository.get(conf)
						.getExtensionPoint(ParseFilter.X_POINT_ID);
				if (point == null)
					throw new RuntimeException(
							ParseFilter.X_POINT_ID + " not found.");
				Extension[] extensions = point.getExtensions();
				for (int i = 0; i < extensions.length; i++) {
					Extension extension = extensions[i];
					ParseFilter parseFilter = (ParseFilter) extension
							.getExtensionInstance();
					if (!filterMap
							.containsKey(parseFilter.getClass().getName())) {
						filterMap.put(parseFilter.getClass().getName(),
								parseFilter);
					}
				}
				ParseFilter[] htmlParseFilters = filterMap.values()
						.toArray(new ParseFilter[filterMap.size()]);
				/*
				 * If no ordered filters required, just get the filters in an
				 * indeterminate order
				 */
				if (orderedFilters == null) {
					objectCache.setObject(ParseFilter.class.getName(),
							htmlParseFilters);
				}
				/* Otherwise run the filters in the required order */
				else {
					ArrayList<ParseFilter> filters = new ArrayList<ParseFilter>();
					for (int i = 0; i < orderedFilters.length; i++) {
						ParseFilter filter = filterMap.get(orderedFilters[i]);
						if (filter != null) {
							filters.add(filter);
						}
					}
					objectCache.setObject(ParseFilter.class.getName(),
							filters.toArray(new ParseFilter[filters.size()]));
				}
			} catch (PluginRuntimeException e) {
				throw new RuntimeException(e);
			}
			this.parseFilters = (ParseFilter[]) objectCache
					.getObject(ParseFilter.class.getName());
		}
	}

	/** Run all defined filters. */
	public Parse filter(String url, WebPage page, Parse parse,
			HTMLMetaTags metaTags, DocumentFragment doc) {

		// loop on each filter
		for (ParseFilter parseFilter : parseFilters) {
			// call filter interface
			parse = parseFilter.filter(url, page, parse, metaTags, doc);

			// any failure on parse obj, return
			if (!ParseStatusUtils.isSuccess(parse.getParseStatus())) {
				return parse;
			}
		}

		return parse;
	}

	public Collection<WebPage.Field> getFields() {
		Collection<WebPage.Field> fields = new HashSet<WebPage.Field>();
		for (ParseFilter htmlParseFilter : parseFilters) {
			Collection<WebPage.Field> pluginFields = htmlParseFilter
					.getFields();
			if (pluginFields != null) {
				fields.addAll(pluginFields);
			}
		}
		return fields;
	}
}
